<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<!-- Encoding declaration: the media type must be text/html ("text/xhtml" is not a registered type). -->
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8"/>
<!-- Asks Internet Explorer to use its IE9 document mode. -->
<meta http-equiv="X-UA-Compatible" content="IE=9"/>
<title>JSpider Library: src/main/java/com/spider/jspiderlibrary2/RobotsParser.java Source File</title>

<!-- Stylesheets for the navigation tabs, the page in general, and the search box. -->
<link href="tabs.css" rel="stylesheet" type="text/css"/>
<link href="doxygen.css" rel="stylesheet" type="text/css" />

<link href="search/search.css" rel="stylesheet" type="text/css"/>
<!-- jquery.js is loaded first because the inline script below uses $(document).ready. -->
<script type="text/javascript" src="jquery.js"></script>
<script type="text/javascript" src="search/search.js"></script>
<script type="text/javascript">
  // Once the DOM is ready, select entry 0 of the search filter menu.
  // searchBox is created by an inline script in the page body.
  $(document).ready(function() { searchBox.OnSelectItem(0); });
</script>

</head>
<body>
<div id="top"><!-- do not remove this div! -->

<!-- Title banner: a single-row table holding the project name. -->
<div id="titlearea">
<table cellspacing="0" cellpadding="0">
 <tbody>
 <tr style="height: 56px;">
  <td style="padding-left: 0.5em;">
   <div id="projectname">JSpider Library
   </div>
  </td>
 </tr>
 </tbody>
</table>
</div>

<!-- Generated by Doxygen 1.7.6.1 -->
<script type="text/javascript">
// Search controller object; the inline event handlers in this page call its methods.
var searchBox = new SearchBox("searchBox", "search",false,'Search');
</script>
  <!-- Primary navigation tabs; "Files" is marked as the current section. -->
  <div id="navrow1" class="tabs">
    <ul class="tablist">
      <li><a href="index.html"><span>Main&#160;Page</span></a></li>
      <li><a href="namespaces.html"><span>Packages</span></a></li>
      <li><a href="annotated.html"><span>Classes</span></a></li>
      <li class="current"><a href="files.html"><span>Files</span></a></li>
      <li>
        <!-- Search box: filter icon, text field, and a link that closes the results window. -->
        <div id="MSearchBox" class="MSearchBoxInactive">
        <span class="left">
          <!-- Hovering the icon calls OnSearchSelectShow / OnSearchSelectHide on searchBox. -->
          <img id="MSearchSelect" src="search/mag_sel.png"
               onmouseover="return searchBox.OnSearchSelectShow()"
               onmouseout="return searchBox.OnSearchSelectHide()"
               alt=""/>
          <!-- title gives the field an accessible name; there is no visible label element. -->
          <input type="text" id="MSearchField" value="Search" accesskey="S"
               title="Search"
               onfocus="searchBox.OnSearchFieldFocus(true)"
               onblur="searchBox.OnSearchFieldFocus(false)"
               onkeyup="searchBox.OnSearchFieldChange(event)"/>
          </span><span class="right">
            <!-- The image is the link's only content, so its alt text names the link. -->
            <a id="MSearchClose" href="javascript:searchBox.CloseResultsWindow()"><img id="MSearchCloseImg" border="0" src="search/close.png" alt="Close search results"/></a>
          </span>
        </div>
      </li>
    </ul>
  </div>
  <!-- Secondary navigation row for the Files section. -->
  <div id="navrow2" class="tabs2">
    <ul class="tablist">
      <li><a href="files.html"><span>File&#160;List</span></a></li>
    </ul>
  </div>
</div>
<!-- Page heading: path of the source file shown in the listing. -->
<div class="header">
  <div class="headertitle">
    <div class="title">src/main/java/com/spider/jspiderlibrary2/RobotsParser.java</div>
  </div>
</div><!--header-->
<!--
  Syntax-highlighted listing of RobotsParser.java.
  Each listing line begins with an anchor named lNNNNN followed by the zero-padded line number.
  Where the number itself is a link, it points into the generated class documentation page,
  as do the linked identifiers (RobotsParser, SEARCH, STOP, DISALLOW, SEARCH_LIMIT, robotSafe).
  Numbers 00008 to 00011 do not appear in the listing.
  Whitespace inside the pre element is rendered verbatim, so the lines below must not be
  re-indented or reflowed.
-->
<div class="contents">
<a href="_robots_parser_8java.html">Go to the documentation of this file.</a><div class="fragment"><pre class="fragment"><a name="l00001"></a>00001 <span class="keyword">package </span>com.spider.jspiderlibrary2;
<a name="l00002"></a>00002 
<a name="l00003"></a>00003 <span class="keyword">import</span> java.io.IOException;
<a name="l00004"></a>00004 <span class="keyword">import</span> java.io.InputStream;
<a name="l00005"></a>00005 <span class="keyword">import</span> java.net.MalformedURLException;
<a name="l00006"></a>00006 <span class="keyword">import</span> java.net.URL;
<a name="l00007"></a>00007 <span class="keyword">import</span> java.util.StringTokenizer;
<a name="l00012"></a>00012 <span class="comment">/*</span>
<a name="l00013"></a>00013 <span class="comment"> * RobotsParser clase que parsea el archivo robots.txt de un sitio web</span>
<a name="l00014"></a>00014 <span class="comment"> * Usando siempre la direccion base con protocolo http</span>
<a name="l00015"></a>00015 <span class="comment"> */</span>
<a name="l00016"></a><a class="code" href="classcom_1_1spider_1_1jspiderlibrary2_1_1_robots_parser.html">00016</a> <span class="keyword">public</span> <span class="keyword">class </span><a class="code" href="classcom_1_1spider_1_1jspiderlibrary2_1_1_robots_parser.html">RobotsParser</a> {
<a name="l00017"></a>00017     <span class="comment">//Palabras claves contenidas dentro del archivo robots.txt necesarias</span>
<a name="l00018"></a>00018     <span class="comment">//para realizar el parseo ejemplo http://www.facebook.com/robots.txt</span>
<a name="l00019"></a>00019 
<a name="l00020"></a><a class="code" href="classcom_1_1spider_1_1jspiderlibrary2_1_1_robots_parser.html#a0455347c6ace914cb64885d5cb4e051b">00020</a>     <span class="keyword">public</span> <span class="keyword">static</span> <span class="keyword">final</span> String <a class="code" href="classcom_1_1spider_1_1jspiderlibrary2_1_1_robots_parser.html#a0455347c6ace914cb64885d5cb4e051b">SEARCH</a> = <span class="stringliteral">&quot;Search&quot;</span>;
<a name="l00021"></a><a class="code" href="classcom_1_1spider_1_1jspiderlibrary2_1_1_robots_parser.html#aff7a48bec0d0fa25e624afcc60144f32">00021</a>     <span class="keyword">public</span> <span class="keyword">static</span> <span class="keyword">final</span> String <a class="code" href="classcom_1_1spider_1_1jspiderlibrary2_1_1_robots_parser.html#aff7a48bec0d0fa25e624afcc60144f32">STOP</a> = <span class="stringliteral">&quot;Stop&quot;</span>;
<a name="l00022"></a><a class="code" href="classcom_1_1spider_1_1jspiderlibrary2_1_1_robots_parser.html#a0d4b2f94e6784a8bff783928ef5d5a78">00022</a>     <span class="keyword">public</span> <span class="keyword">static</span> <span class="keyword">final</span> String <a class="code" href="classcom_1_1spider_1_1jspiderlibrary2_1_1_robots_parser.html#a0d4b2f94e6784a8bff783928ef5d5a78">DISALLOW</a> = <span class="stringliteral">&quot;Disallow:&quot;</span>;
<a name="l00023"></a><a class="code" href="classcom_1_1spider_1_1jspiderlibrary2_1_1_robots_parser.html#ad78a55ba0cb474ee527ee4c2a73c3923">00023</a>     <span class="keyword">public</span> <span class="keyword">static</span> <span class="keyword">final</span> <span class="keywordtype">int</span> <a class="code" href="classcom_1_1spider_1_1jspiderlibrary2_1_1_robots_parser.html#ad78a55ba0cb474ee527ee4c2a73c3923">SEARCH_LIMIT</a> = 50;
<a name="l00024"></a>00024 
<a name="l00025"></a><a class="code" href="classcom_1_1spider_1_1jspiderlibrary2_1_1_robots_parser.html#a763eb021063080b7cbfc3a0477b89b38">00025</a>     <span class="keyword">public</span> <span class="keyword">static</span> <span class="keywordtype">boolean</span> <a class="code" href="classcom_1_1spider_1_1jspiderlibrary2_1_1_robots_parser.html#a763eb021063080b7cbfc3a0477b89b38">robotSafe</a>(URL url) {
<a name="l00026"></a>00026         String strHost = url.getHost();
<a name="l00027"></a>00027         <span class="comment">//Donde se encuentra el archivo robots.txt</span>
<a name="l00028"></a>00028         <span class="comment">//http://www.robotstxt.org/robotstxt.html ver estandar aqui</span>
<a name="l00029"></a>00029         String strRobot = <span class="stringliteral">&quot;http://&quot;</span> + strHost + <span class="stringliteral">&quot;/robots.txt&quot;</span>;
<a name="l00030"></a>00030         URL urlRobot;
<a name="l00031"></a>00031         <span class="keywordflow">try</span> {
<a name="l00032"></a>00032             <span class="comment">//Checkea por URL mal formada</span>
<a name="l00033"></a>00033             urlRobot = <span class="keyword">new</span> URL(strRobot);
<a name="l00034"></a>00034         } <span class="keywordflow">catch</span> (MalformedURLException e) {
<a name="l00035"></a>00035             <span class="comment">// URL Mal formada, no confiar devolver false</span>
<a name="l00036"></a>00036             <span class="keywordflow">return</span> <span class="keyword">false</span>;
<a name="l00037"></a>00037         }
<a name="l00038"></a>00038         String strCommands;
<a name="l00039"></a>00039         <span class="keywordflow">try</span> {
<a name="l00040"></a>00040             InputStream urlRobotStream = urlRobot.openStream();
<a name="l00041"></a>00041             <span class="comment">// Lee en todo el archivo, el valor del byte[] queda establecido como parametro definido</span>
<a name="l00042"></a>00042             byte b[] = <span class="keyword">new</span> byte[1000];
<a name="l00043"></a>00043             <span class="keywordtype">int</span> numRead = urlRobotStream.read(b);
<a name="l00044"></a>00044             strCommands = <span class="keyword">new</span> String(b, 0, numRead);
<a name="l00045"></a>00045             <span class="keywordflow">while</span> (numRead != -1) {
<a name="l00046"></a>00046                 numRead = urlRobotStream.read(b);
<a name="l00047"></a>00047                 <span class="keywordflow">if</span> (numRead != -1) {
<a name="l00048"></a>00048                     String newCommands = <span class="keyword">new</span> String(b, 0, numRead);
<a name="l00049"></a>00049                     strCommands += newCommands;
<a name="l00050"></a>00050                 }
<a name="l00051"></a>00051             }
<a name="l00052"></a>00052             urlRobotStream.close();
<a name="l00053"></a>00053         } <span class="keywordflow">catch</span> (IOException e) {
<a name="l00054"></a>00054             <span class="comment">//Sino existe el archivo robots.txt en el sitio web</span>
<a name="l00055"></a>00055             <span class="comment">//crawleo sin limitaciones :)</span>
<a name="l00056"></a>00056             <span class="keywordflow">return</span> <span class="keyword">true</span>;
<a name="l00057"></a>00057         }
<a name="l00058"></a>00058         <span class="comment">// Busco los &quot;Disallow:&quot; dentro del archivo.</span>
<a name="l00059"></a>00059         String strURL = url.getFile();
<a name="l00060"></a>00060         <span class="keywordtype">int</span> index = 0;
<a name="l00061"></a>00061         <span class="keywordflow">while</span> ((index = strCommands.indexOf(<a class="code" href="classcom_1_1spider_1_1jspiderlibrary2_1_1_robots_parser.html#a0d4b2f94e6784a8bff783928ef5d5a78">DISALLOW</a>, index)) != -1) {
<a name="l00062"></a>00062             index += <a class="code" href="classcom_1_1spider_1_1jspiderlibrary2_1_1_robots_parser.html#a0d4b2f94e6784a8bff783928ef5d5a78">DISALLOW</a>.length();
<a name="l00063"></a>00063             String strPath = strCommands.substring(index);
<a name="l00064"></a>00064             StringTokenizer st = <span class="keyword">new</span> StringTokenizer(strPath);
<a name="l00065"></a>00065             <span class="comment">//Empleo StringTokenizer para splitear el path en subsStrings</span>
<a name="l00066"></a>00066             <span class="keywordflow">if</span> (!st.hasMoreTokens()) <span class="comment">//Cuando no hay mas Tokens fuiste.</span>
<a name="l00067"></a>00067             {
<a name="l00068"></a>00068                 <span class="keywordflow">break</span>;
<a name="l00069"></a>00069             }
<a name="l00070"></a>00070             String strBadPath = st.nextToken();
<a name="l00071"></a>00071             <span class="comment">//System.out.println(&quot;The foribidden path is: &quot;+strBadPath);</span>
<a name="l00072"></a>00072             <span class="comment">//Si la url empieza con un disallowed path no es segura.</span>
<a name="l00073"></a>00073             <span class="keywordflow">if</span> (strURL.indexOf(strBadPath) == 0) {
<a name="l00074"></a>00074                 <span class="keywordflow">return</span> <span class="keyword">false</span>;
<a name="l00075"></a>00075             }
<a name="l00076"></a>00076         }
<a name="l00077"></a>00077         <span class="keywordflow">return</span> <span class="keyword">true</span>;
<a name="l00078"></a>00078     }
<a name="l00079"></a>00079 }
</pre></div></div><!-- contents -->
<!--
  Window showing the search filter options (All, Classes, Namespaces, Files, Functions, Variables).
  Each entry calls searchBox.OnSelectItem(n) with its zero-based position; mouse and key events
  on the window are forwarded to searchBox.
  NOTE(review): the entries are kept on one line with no whitespace between them; search.js may
  depend on this exact child-node layout, so confirm before reformatting.
-->
<div id="MSearchSelectWindow"
     onmouseover="return searchBox.OnSearchSelectShow()"
     onmouseout="return searchBox.OnSearchSelectHide()"
     onkeydown="return searchBox.OnSearchSelectKey(event)">
<a class="SelectItem" href="javascript:void(0)" onclick="searchBox.OnSelectItem(0)"><span class="SelectionMark">&#160;</span>All</a><a class="SelectItem" href="javascript:void(0)" onclick="searchBox.OnSelectItem(1)"><span class="SelectionMark">&#160;</span>Classes</a><a class="SelectItem" href="javascript:void(0)" onclick="searchBox.OnSelectItem(2)"><span class="SelectionMark">&#160;</span>Namespaces</a><a class="SelectItem" href="javascript:void(0)" onclick="searchBox.OnSelectItem(3)"><span class="SelectionMark">&#160;</span>Files</a><a class="SelectItem" href="javascript:void(0)" onclick="searchBox.OnSelectItem(4)"><span class="SelectionMark">&#160;</span>Functions</a><a class="SelectItem" href="javascript:void(0)" onclick="searchBox.OnSelectItem(5)"><span class="SelectionMark">&#160;</span>Variables</a></div>

<!-- iframe showing the search results (closed by default); the title attribute gives the frame an accessible name -->
<div id="MSearchResultsWindow">
<iframe src="javascript:void(0)" frameborder="0" 
        name="MSearchResults" id="MSearchResults" title="Search results">
</iframe>
</div>



<!-- Page footer: generation timestamp, project name, and a link to the Doxygen site (https). -->
<hr class="footer"/><address class="footer"><small>
Generated on Thu Feb 16 2012 02:56:13 for JSpider Library by &#160;<a href="https://www.doxygen.org/index.html">
<img class="footer" src="doxygen.png" alt="doxygen"/>
</a> 1.7.6.1
</small></address>

</body>
</html>
